Neurosynth electrode word matrix

Download the Neurosynth database and the associated abstracts from PubMed, extract cognitive keywords from each abstract, and project the study coordinates onto the Smartphone Brain Scanner surface to obtain an electrode-by-word matrix.


In [1]:
# Wildcard import of the notebook environment; it is assumed to
# provide math, numpy as np and pandas as pd, all used below
from everything import *

In [2]:
from brede.data.neurosynth import NeurosynthDatabase
from brede.data.pubmed import Pubmed
from brede.data.words import CognitiveWords
from brede.core.matrix import Matrix
from brede.data.sbs2 import SBS2Data

In [3]:
# Log to a logfile named 'brede.log'
import logging
logger = logging.getLogger()

file_handler = logging.FileHandler(filename='brede.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)

In [4]:
pubmed = Pubmed()

In [5]:
# Load Neurosynth
nd = NeurosynthDatabase()
nd_database = nd.database()
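
A quick look at the study table helps orient the rest of the notebook; only the `id`, `x`, `y` and `z` columns are used below.

In [ ]:
# Peek at the columns used later: `id` holds PubMed IDs and
# x, y, z the reported activation coordinates
nd_database[['id', 'x', 'y', 'z']].head()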

In [6]:
# Get abstracts for Neurosynth papers from PubMed.
# This may take several hours.
medlines = pubmed.get_medlines(set(nd_database.id))
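
Since the download takes hours, it may be worth caching the result; a minimal sketch using pickle (the filename 'medlines.pkl' is arbitrary):

In [ ]:
# Optional: cache the downloaded MEDLINE records so a kernel restart
# does not require a new multi-hour download
import pickle

with open('medlines.pkl', 'wb') as fid:
    pickle.dump(medlines, fid)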

In [7]:
# Find keywords in each abstract and collect them as a list of keyword lists
cognitive_words = CognitiveWords()
corpus = []
for n, medline in enumerate(medlines):
    abstract = medline.get('AB', '').lower()
    keywords = cognitive_words.find_all(abstract)
    corpus.append(keywords)
    logger.debug(('Iterating over medline abstracts '
                  'for keyword extraction: {}').format(n))
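
A spot check on a single abstract can confirm the extraction behaves as expected; a sketch, assuming the first record has an abstract:

In [ ]:
# Show the start of the first abstract and the keywords found in it
print(medlines[0].get('AB', '')[:200])
print(corpus[0][:10])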

In [8]:
# Corpus-wide keywords
all_keywords = [word for wordlist in corpus for word in wordlist]
all_unique_keywords = set(all_keywords)
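
Before building the matrix, the most frequent keywords can be inspected; a small sketch with collections.Counter:

In [ ]:
# The ten most common extracted keywords across the corpus
from collections import Counter

Counter(all_keywords).most_common(10)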

In [9]:
# Build the bag-of-phrases matrix: one row per paper (PMID), one column
# per unique keyword. Initialize with float zeros so that the fractional
# counts for multiword phrases below are not truncated.
bag_of_phrases = pd.DataFrame(0.0, index=[medline['PMID'] for medline in medlines],
                              columns=list(all_unique_keywords))
for n, (medline, keywords) in enumerate(zip(medlines, corpus)):
    pmid = medline['PMID']
    for keyword in keywords:
        bag_of_phrases.loc[pmid, keyword] += 1
        # Credit the parts of a multiword phrase fractionally, e.g.,
        # 'working memory' adds 0.5 to 'working' and 0.5 to 'memory'
        if ' ' in keyword:
            keyword_parts = keyword.split()
            for keyword_part in keyword_parts:
                if keyword_part in all_unique_keywords:
                    bag_of_phrases.loc[pmid, keyword_part] += 1.0 / len(keyword_parts)
    logger.debug(('Iterating over medline abstracts '
                  'for matrix construction: {}').format(n))
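
A sanity check on the dimensions and sparsity of the resulting matrix; a sketch:

In [ ]:
# One row per paper, one column per unique keyword;
# the fraction of nonzero entries should be small
print(bag_of_phrases.shape)
print((bag_of_phrases.values > 0).mean())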

In [10]:
# Scale bag-of-phrases matrix with IDF
scaled = Matrix(bag_of_phrases).idf()
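
The exact weighting is defined by brede's Matrix.idf; as a sketch of the idea only, a common IDF variant multiplies each term count by log(N / n_t), where N is the number of documents and n_t the number of documents containing term t:

In [ ]:
# Equivalent-in-spirit IDF scaling (the variant implemented by
# Matrix.idf may differ in detail)
N = len(bag_of_phrases)
n_t = (bag_of_phrases > 0).sum(axis=0)
idf_scaled = bag_of_phrases * np.log(float(N) / n_t)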

In [11]:
# Read Smartphone Brain Scanner surface
sbs2_data = SBS2Data()
surface = sbs2_data.surface()
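
The number of surface vertices determines the number of rows in the final electrode word matrix; a quick check:

In [ ]:
# Vertex coordinates of the surface model, one row per vertex
surface.vertices.shape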

In [12]:
# Group activation coordinates by study and prepare a
# studies-by-vertices matrix for the Gaussian kernel values
grouped = nd_database[['id', 'x', 'y', 'z']].groupby('id')
v = np.zeros((len(grouped), surface.vertices.shape[0]))
sigma = 10.0  # kernel width in millimeters; float avoids integer division
norm1 = 1 / (sigma * math.sqrt(2 * math.pi))  # Gaussian normalization constant
norm2 = -1 / (2 * sigma ** 2)                 # scale of the exponent

In [13]:
# Cortexification of study coordinates: spread each study's reported
# (x, y, z) foci onto the surface vertices with a Gaussian kernel.
# Iterating over the groupby object gives a deterministic (sorted) order.
for n, (study_id, group) in enumerate(grouped):
    coords = group[['x', 'y', 'z']]
    p = 0
    for index, coord in coords.iterrows():
        p += norm1 * np.exp(norm2 * np.sum((surface.vertices - coord.values) ** 2, axis=1))
    p /= math.sqrt(len(coords))
    v[n, :] = p
    if not n % 100:
        logger.debug(('Iterating over studies '
                      'for computing Talairach coordinate load: {}').format(n))
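
Written out, the loop computes for each study n with coordinates c_1, ..., c_K the load at a vertex position r:

$$p_n(r) = \frac{1}{\sqrt{K}} \sum_{k=1}^{K} \frac{1}{\sigma \sqrt{2 \pi}} \exp\left( -\frac{\lVert r - c_k \rVert^2}{2 \sigma^2} \right), \qquad \sigma = 10 \text{ mm}.$$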

In [14]:
# Project the IDF-scaled word counts onto the surface. The rows of
# scaled (indexed by PMID strings from PubMed) are first aligned with
# the study order of v (sorted groupby keys); the str() conversion
# assumes Neurosynth ids are PubMed PMIDs and that an abstract was
# retrieved for every study
study_ids = sorted(grouped.groups)
scaled_aligned = scaled.loc[[str(study_id) for study_id in study_ids]]
product = v.T.dot(scaled_aligned)
product_matrix = Matrix(product, columns=bag_of_phrases.columns)
product_matrix.shape


Out[14]:
(1028, 1135)
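
Each row of the product corresponds to a surface vertex and each column to a keyword, so the strongest word associations at a vertex can be listed; a sketch with an arbitrarily chosen vertex index:

In [ ]:
# Ten highest-loading words for vertex 0
row = product[0]
bag_of_phrases.columns[np.argsort(row)[::-1][:10]]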

In [15]:
product_matrix.to_csv('neurosynth electrode word matrix.csv')
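
The saved matrix can be reloaded later with pandas; a sketch, assuming the default index column was written:

In [ ]:
# Reload the electrode word matrix; the first column holds the row index
word_matrix = pd.read_csv('neurosynth electrode word matrix.csv', index_col=0)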
